Get the data
suppressMessages({
library(bigsnpr)
library(tidyverse)
library(foreach)
})
celiac <- snp_attach("backingfiles/celiacQC.rds")
G <- celiac$genotypes
CHR <- celiac$map$chromosome
POS <- celiac$map$physical.pos
NCORES <- nb_cores()
Without removing long-range LD regions
system.time(
ind.keep <- snp_clumping(G, CHR, ncores = NCORES)
)
## user system elapsed
## 4.326 0.572 91.602
system.time(
obj.svd <- big_randomSVD(G, snp_scaleBinom(),
ind.col = ind.keep,
ncores = NCORES)
)
## user system elapsed
## 0.138 0.082 33.316
(plot_loadingsA <- plot(obj.svd, type = "loadings", loadings = 1:6, coeff = 1))

oneSNP <- which.max(abs(obj.svd$v[, 4]))
(plot_scoresA <- plot(obj.svd, type = "scores", scores = 4:5, coeff = 2) +
aes(color = as.factor(G[, ind.keep[oneSNP]])) +
labs(title = NULL, color = "Genotype") +
theme(legend.position = c(0.15, 0.15)) +
guides(color = guide_legend(override.aes = list(size = 4))))

Automatically removing long-range LD regions
system.time(
test <- snp_autoSVD(G, CHR, POS, ncores = NCORES)
)
## Phase of clumping at r2 > 0.2.. keep 94517 SNPs.
##
## Iteration 1:
## Computing SVD..
## 3 long-range LD regions were detected..
##
## Iteration 2:
## Computing SVD..
## 2 long-range LD regions were detected..
##
## Iteration 3:
## Computing SVD..
##
## Converged!
## user system elapsed
## 44.480 0.501 277.615
htmlTable::htmlTable(attr(test, "lrldr"), rnames = FALSE,
css.table = "margin-top: 1em;
margin-bottom: 1em;
margin-left: auto;
margin-right: auto",
css.cell = "padding: 5px 5px 5px 5px")
| Chr |
Start |
Stop |
| 2 |
134374210 |
138130055 |
| 6 |
23788094 |
35793692 |
| 8 |
6341567 |
13526256 |
| 3 |
163071168 |
165026940 |
| 14 |
46690298 |
47500807 |
attr(test, "lrldr") %>%
transmute(
Chromosome = Chr,
`Start (Mb)` = Start / 1e6,
`Stop (Mb)` = Stop / 1e6
) %>%
xtable::xtable(digits = 1, caption = "", label = "", auto = TRUE)
% latex table generated in R 3.3.3 by xtable 1.8-2 package
% Thu Sep 7 13:20:14 2017
\begin{table}[ht]
\centering
\begin{tabular}{lrrr}
\hline
& Chromosome & Start (Mb) & Stop (Mb) \\
\hline
1 & 2 & 134.4 & 138.1 \\
2 & 6 & 23.8 & 35.8 \\
3 & 8 & 6.3 & 13.5 \\
4 & 3 & 163.1 & 165.0 \\
5 & 14 & 46.7 & 47.5 \\
\hline
\end{tabular}
\caption{}
\label{}
\end{table}
(plot_loadingsB <- plot(test, type = "loadings", loadings = 1:6, coeff = 1))

# Get population from external files
pop.files <- list.files(path = "../thesis-celiac/Dubois2010_data/",
pattern = "cluster_*",
full.names = TRUE)
pop <- snp_getSampleInfos(celiac, pop.files)[[1]]
pop.names <- c("Netherlands", "Italy", "UK1", "UK2", "Finland")
(plot_scoresB <- plot(test, type = "scores", scores = 4:5, coeff = 2) +
aes(color = pop.names[pop]) +
labs(title = NULL, color = "Population") +
theme(legend.position = c(0.2, 0.75)) +
guides(color = guide_legend(override.aes = list(size = 4))))

# save plots for paper
p <- cowplot::plot_grid(plot_loadingsA, plot_loadingsB, align = "hv",
ncol = 2, labels = c("A", "B"), label_size = 30)
ggsave("figures/loadings.png", width = 1620, height = 920, scale = 1/75)
p <- cowplot::plot_grid(plot_scoresA, plot_scoresB, align = "hv",
ncol = 2, labels = c("A", "B"), label_size = 30)
ggsave("figures/scores.png", width = 1640, height = 750, scale = 1/75)
doParallel::registerDoParallel(NCORES)
nBoot <- 500
tribble(
~name, ~svd,
"When automatically removing long-range LD regions", test,
"When keeping long-range LD regions", obj.svd
) %>%
apply(1, function(method) {
V <- method[["svd"]][["v"]]^2
m <- nrow(V)
foreach(ic = seq_len(nBoot), .combine = 'rbind') %dopar% {
ind <- sort(sample(m, replace = TRUE))
apply(V[ind, ], 2, ineq::Gini)
} %>%
reshape2::melt() %>%
cbind(method = method[["name"]])
}) %>%
Reduce(f = 'rbind') %>%
ggplot(aes(as.factor(Var2), value)) %>%
bigstatsr:::MY_THEME(coeff = 2) +
geom_hline(yintercept = 2 / pi, lty = 2) +
geom_boxplot(aes(color = method), lwd = 1.5) +
labs(y = "Gini coefficient of squared loadings",
x = "Principal Component", color = "") +
theme(legend.position = "top") +
guides(color = guide_legend(ncol = 1))

doParallel::stopImplicitCluster()
ggsave("figures/gini.png", width = 1341, height = 865, scale = 1/75)
Removing long-range LD regions based on predefined table
ind.excl0 <- snp_indLRLDR(CHR, POS)
ind.keep0 <- snp_clumping(G, CHR, exclude = ind.excl0, ncores = NCORES)
obj.svd0 <- big_randomSVD(G, snp_scaleBinom(),
ind.col = ind.keep0,
ncores = NCORES)
corr <- cor(test$u, obj.svd0$u)
rownames(corr) <- colnames(corr) <- paste0("PC", 1:10)
htmlTable::htmlTable(round(100 * corr, 2),
align = paste(c("|", rep("c", 10)), collapse = ""),
css.table = "margin-top: 1em;
margin-bottom: 1em;
margin-left: auto;
margin-right: auto",
css.cell = "padding: 5px 5px 5px 5px")
| |
PC1 |
PC2 |
PC3 |
PC4 |
PC5 |
PC6 |
PC7 |
PC8 |
PC9 |
PC10 |
| PC1 |
99.99 |
0.07 |
0.05 |
0 |
0.01 |
-0.02 |
-0.01 |
-0.01 |
0 |
-0.02 |
| PC2 |
-0.07 |
99.98 |
0.02 |
0.02 |
0.02 |
-0.03 |
0 |
-0.02 |
0 |
-0.04 |
| PC3 |
-0.06 |
-0.02 |
99.88 |
0.18 |
-0.01 |
0.05 |
0.11 |
0.09 |
-0.04 |
-0.11 |
| PC4 |
0 |
-0.01 |
-0.22 |
99.89 |
0.13 |
0.15 |
-0.07 |
0.03 |
-0.02 |
0.09 |
| PC5 |
0 |
0 |
-0.03 |
-0.14 |
99.65 |
-0.47 |
0.35 |
-0.12 |
-0.88 |
0.62 |
| PC6 |
0.02 |
0.04 |
-0.05 |
-0.18 |
0.45 |
99.62 |
0.5 |
-0.44 |
0.12 |
-0.45 |
| PC7 |
0.02 |
0.04 |
-0.14 |
0.01 |
-0.53 |
-0.42 |
98.9 |
3.13 |
-0.82 |
1.94 |
| PC8 |
0 |
-0.01 |
0.29 |
0.09 |
-0.25 |
-0.4 |
3.38 |
-97.96 |
-5.46 |
1.13 |
| PC9 |
0 |
0.02 |
0.09 |
-0.02 |
0.67 |
0.01 |
0.96 |
-5.11 |
96.84 |
9.99 |
| PC10 |
0.01 |
0 |
0.1 |
-0.05 |
-0.45 |
0.31 |
-1.95 |
-0.06 |
-8.19 |
91.64 |
mean(diag(abs(corr)))
## [1] 0.9843461